package com.lanmaoly.demo;

import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;

/**
 * Helpers for processing a text file whose lines have the form
 * {@code <id> <url>} (fields separated by a single space): hashing the URL
 * host into one of {@code HASHLEN} buckets, appending lines to per-bucket
 * files, sorting a bucket file by its URL field, and guessing a file's
 * encoding from its leading bytes.
 */
public class Test1 {
    /** Number of hash buckets; {@link #hash(char[])} returns a value in {@code [0, HASHLEN)}. */
    private static final int HASHLEN = 100;
    /** Directory prefix under which {@code <name>.txt} bucket files are read and written. */
    private static final String file_dir = "D:\\学习\\实验室项目\\ImageNet图片爬取\\classify_url\\";
    /** Path of the input file that {@link #main(String[])} processes. */
    private static final String src_file = "D:\\学习\\实验室项目\\ImageNet图片爬取\\fall11_urls.txt";

    /**
     * Entry point: runs {@link #classify_url(String)} on the configured source file.
     *
     * @param args ignored
     * @throws Exception declared for compatibility; the called code reports I/O errors itself
     */
    public static void main(String[] args) throws Exception {
        classify_url(src_file);
    }

    /**
     * Sorts the lines of {@code file_dir + filename + ".txt"} by their second
     * space-separated field and writes the result to
     * {@code file_dir + filename + "_.txt"}. The number of lines read is
     * printed to standard output.
     *
     * <p>Lines without a second field sort as if that field were empty.
     * If the input cannot be read nothing is written.
     *
     * @param filename bucket file name without directory and without the {@code .txt} suffix
     */
    public static void rank_filedata(String filename) {
        String path1 = file_dir + filename + ".txt";
        String path2 = file_dir + filename + "_" + ".txt";
        List<String> list = reader_list(path1);
        if (list == null) {
            // reader_list has already reported the failure.
            return;
        }
        System.out.println(list.size());
        Collections.sort(list, new Comparator<String>() {
            @Override
            public int compare(String s1, String s2) {
                return sortKey(s1).compareTo(sortKey(s2));
            }
        });
        writer_list(list, path2);
    }

    /** Returns the second space-separated field of {@code line}, or {@code ""} if there is none. */
    private static String sortKey(String line) {
        String[] fields = line.split(" ");
        return fields.length > 1 ? fields[1] : "";
    }

    /**
     * Reads every line of a text file using the platform default charset.
     *
     * @param path path of the file to read; must not be {@code null}
     * @return the lines in file order, or {@code null} if the file could not
     *         be read (the error is printed to standard error)
     */
    public static List<String> reader_list(String path) {
        List<String> lineList = new ArrayList<>();
        try (BufferedReader reader = new BufferedReader(new FileReader(path))) {
            String line;
            while ((line = reader.readLine()) != null) {
                lineList.add(line);
            }
            return lineList;
        } catch (IOException e) {
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Writes each element of {@code list} to {@code path}, one per line,
     * terminated by CR LF, using the platform default charset. An existing
     * file is overwritten. I/O errors are printed to standard error.
     *
     * @param list elements to write (their string form is written); must not be {@code null}
     * @param path destination file
     */
    public static void writer_list(List<?> list, String path) {
        try (BufferedWriter writer = new BufferedWriter(new FileWriter(path))) {
            for (Object line : list) {
                writer.write(line + "\r\n");
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Reads the file at {@code path} line by line (decoded with the charset
     * reported by {@link #judgeFileCode(String)}), computes the hash bucket of
     * each line's URL host, and echoes to standard output every line whose
     * zero-based index is a positive multiple of 100 and that contains a
     * character accepted by {@link #isCnorEn(char)}.
     *
     * <p>NOTE: appending each line to its bucket file via
     * {@link #writer(String, String)} is currently disabled, so this method
     * only validates the URLs and prints the sampled lines.
     *
     * @param path path of the input file
     */
    public static void classify_url(String path) {
        String filecode = judgeFileCode(path);
        if (filecode == null) {
            // judgeFileCode has already reported why the file could not be inspected.
            return;
        }
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(new FileInputStream(path), filecode))) {
            String line = reader.readLine();
            int line_num = 0;
            while (line != null) {
                classifyLine(line);
                line = reader.readLine();
                line_num++;
                // The sampled line is the one just read; it is null at end of file.
                if (line != null && line_num % 100 == 0 && containsCnOrEn(line)) {
                    System.out.println(line);
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Computes the hash bucket for the host of the URL in the second field of
     * {@code line}. A malformed line is reported on standard error and
     * skipped so that one bad record does not abort the whole run.
     */
    private static void classifyLine(String line) {
        String[] fields = line.split(" ");
        if (fields.length < 2) {
            System.err.println("No URL field in line: " + line);
            return;
        }
        try {
            String host = new URL(fields[1]).getHost();
            int bucket = hash(host.toCharArray());
            // Writing is disabled for now: writer(String.valueOf(bucket), line);
        } catch (MalformedURLException e) {
            e.printStackTrace();
        }
    }

    /** Returns whether any character of {@code text} is accepted by {@link #isCnorEn(char)}. */
    private static boolean containsCnOrEn(String text) {
        for (int i = 0; i < text.length(); i++) {
            if (isCnorEn(text.charAt(i))) {
                return true;
            }
        }
        return false;
    }

    /**
     * Tests whether {@code c} lies in U+0000..U+00FF (treated here as
     * "English") or in U+0391..U+FFE5 (treated here as "Chinese"; the range
     * is much wider than the CJK blocks).
     *
     * @param c character to test
     * @return {@code true} if {@code c} is in either range
     */
    static boolean isCnorEn(char c) {
        boolean wideRange = c >= 0x0391 && c <= 0xFFE5;
        boolean latinRange = c <= 0x00FF;
        return wideRange || latinRange;
    }

    /**
     * Hashes a character sequence into a bucket index.
     *
     * <p>The recurrence is {@code index += index * 31 + c} (that is,
     * {@code index * 32 + c}) with wrapping int arithmetic. It is kept as is
     * because changing it would assign hosts to different buckets.
     *
     * @param word characters to hash
     * @return a value in {@code [0, HASHLEN)}; {@code 0} for an empty array
     */
    public static int hash(char[] word) {
        int index = 0;
        for (char c : word) {
            index += index * 31 + c;
        }
        // |index % HASHLEN| < HASHLEN, so Math.abs cannot overflow here.
        return Math.abs(index % HASHLEN);
    }

    /**
     * Appends {@code line} followed by CR LF to
     * {@code file_dir + filename + ".txt"}, encoded as GBK. The file is
     * created if it does not exist; a {@code null} line only creates the
     * file. I/O errors are printed to standard error.
     *
     * @param filename file name without directory and without the {@code .txt} suffix
     * @param line     text to append, or {@code null} to append nothing
     */
    public static void writer(String filename, String line) {
        String path = file_dir + filename + ".txt";
        // Opening in append mode creates the file when it is missing.
        try (OutputStreamWriter writer =
                new OutputStreamWriter(new FileOutputStream(path, true), "GBK")) {
            if (line != null) {
                writer.write(line + "\r\n");
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Guesses a file's encoding from its first three bytes.
     *
     * @param path path of the file to inspect; must not be {@code null}
     * @return {@code "UTF-8"} if the file starts with the bytes EF BB BF,
     *         {@code "GBK"} otherwise (including files shorter than three
     *         bytes), or {@code null} if the file could not be read (the
     *         error is printed to standard error)
     */
    public static String judgeFileCode(String path) {
        byte[] b = new byte[3];
        try (InputStream in = new FileInputStream(path)) {
            // A single read() may return fewer bytes than requested.
            int filled = 0;
            while (filled < b.length) {
                int n = in.read(b, filled, b.length - filled);
                if (n < 0) {
                    break;
                }
                filled += n;
            }
        } catch (IOException e) {
            e.printStackTrace();
            return null;
        }
        // Unread positions stay 0 and therefore never match the marker bytes.
        boolean utf8Bom = b[0] == (byte) 0xEF && b[1] == (byte) 0xBB && b[2] == (byte) 0xBF;
        return utf8Bom ? "UTF-8" : "GBK";
    }

    /**
     * Guesses a file's encoding from its first two bytes, read as one
     * big-endian 16-bit value.
     *
     * @param fileName path of the file to inspect
     * @return {@code "UTF-8"} for EF BB, {@code "Unicode"} for FF FE,
     *         {@code "UTF-16BE"} for FE FF, {@code "ANSI|ASCII"} for 5C 75,
     *         and {@code "GBK"} for anything else (including files shorter
     *         than two bytes)
     * @throws Exception if the file cannot be opened or read
     */
    public static String codeString(String fileName) throws Exception {
        int p;
        try (BufferedInputStream bin = new BufferedInputStream(new FileInputStream(fileName))) {
            // read() returns -1 at end of file, which yields a value matching no case below.
            p = (bin.read() << 8) + bin.read();
        }
        switch (p) {
            case 0xefbb:
                return "UTF-8";
            case 0xfffe:
                return "Unicode";
            case 0xfeff:
                return "UTF-16BE";
            case 0x5c75: // the ASCII characters backslash and 'u'
                return "ANSI|ASCII";
            default:
                return "GBK";
        }
    }
}
